java获取行政区划编码(省市区县居委5级)

您所在的位置:网站首页 乡镇 区县 地市 省 java获取行政区划编码(省市区县居委5级)

java获取行政区划编码(省市区县居委5级)

2024-07-10 01:10| 来源: 网络整理| 查看: 265

背景

打算做一个省市区多级联动的功能,由于在网上没有找到符合自己要求的数据,便着手写了一个简单的爬虫进行数据抓取。由于时间仓促,代码写得比较简单粗糙(嵌套循环过多),可自行优化。复制代码后即可直接运行。

大概花费一个多小时能运行完成

如果不想自己重新抓取,文章的最后有2份不同数据结构格式的sql,可直接使用;

部分城市没有5级的,例如广东省东莞市下一级是镇。

数据来源:国家统计局《统计用区划代码和城乡划分代码》(2021年) http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/

运行环境:jdk8;  

代码 org.jsoup jsoup 1.11.3

import cn.hutool.core.date.DateUtil;
import cn.hutool.core.util.IdUtil;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpUtil;
import lombok.var;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Date;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Single-threaded crawler for the administrative-division code pages under {@link #baseUrl}.
 *
 * <p>The crawl walks the page hierarchy by CSS class, one level per method:
 * {@code provincetr} (province) → {@code citytr} (city) → {@code countytr} (county)
 * → {@code towntr} (town / sub-district) → {@code villagetr} (village / committee).
 * When a city page contains no {@code countytr} rows, its {@code towntr} rows are used
 * directly, so the county level is skipped for that city.
 *
 * <p>Results are only printed to standard output; the {@code todo} markers show where
 * persistence is meant to be added.
 */
public class Test {

    private static final Logger log = LoggerFactory.getLogger(Test.class);

    /** Root URL of the 2021 code tables; every relative link is resolved against it. */
    public static String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/";

    /** Pause before the first retry of a failed request. */
    private static final long FIRST_RETRY_DELAY_MS = 1000L * 5;

    /** Pause before the second (last) retry of a failed request. */
    private static final long SECOND_RETRY_DELAY_MS = 1000L * 60 * 12;

    /** Number of {@link #getHtml(String)} calls made so far. */
    private static final AtomicInteger atomicInteger = new AtomicInteger(0);

    /** Number of failed request attempts seen so far. */
    private static final AtomicInteger atomicIntegerErrorNum = new AtomicInteger(0);

    /**
     * Entry point. Crawls a single city by default.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // To crawl every province instead, call: test(null, null);
        test("广东省", "中山市");
    }

    /**
     * Downloads a page, retrying up to two times when the request throws.
     *
     * <p>The first retry happens after {@link #FIRST_RETRY_DELAY_MS}, the second after
     * {@link #SECOND_RETRY_DELAY_MS}. A failure of the second retry is propagated.
     *
     * @param url absolute URL to fetch
     * @return the response body
     * @throws Exception if the last attempt fails, or the thread is interrupted while waiting
     */
    private static String getHtml(String url) throws Exception {
        log.info("调用接口次数 :" + atomicInteger.incrementAndGet());
        log.info("请求开始时间 ==>" + DateUtil.formatDateTime(new Date()));
        log.info(url);

        String html;
        try {
            html = getHtml2(url);
        } catch (Exception e) {
            // Log through SLF4J (with the stack trace) instead of printStackTrace().
            log.warn(Thread.currentThread().getName() + " = 超时。。睡5秒再重试", e);
            log.info("error num = " + atomicIntegerErrorNum.incrementAndGet());
            pause(FIRST_RETRY_DELAY_MS);
            log.info(Thread.currentThread().getName() + " = 超时重试");
            try {
                html = getHtml2(url);
            } catch (Exception e2) {
                log.warn(Thread.currentThread().getName() + " = 第二次超时。。睡12分钟再重试", e2);
                log.info("error num = " + atomicIntegerErrorNum.incrementAndGet());
                pause(SECOND_RETRY_DELAY_MS);
                log.info(Thread.currentThread().getName() + " 第二次超时重试");
                html = getHtml2(url);
            }
        }

        log.info("请求结束时间 ==>" + DateUtil.formatDateTime(new Date()));
        System.out.println("");
        System.out.println("");
        return html;
    }

    /**
     * Sleeps for the given time; if interrupted, restores the interrupt flag and rethrows.
     *
     * @param millis time to sleep in milliseconds
     * @throws InterruptedException if the thread is interrupted while sleeping
     */
    private static void pause(long millis) throws InterruptedException {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw e;
        }
    }

    /**
     * Performs one GET request with browser-like headers and a 15 s timeout.
     *
     * @param url absolute URL to fetch
     * @return the response body
     * @throws Exception if the request fails
     */
    private static String getHtml2(String url) throws Exception {
        // No "If-None-Match" header is sent: this crawler keeps no cache, and a conditional
        // request whose ETag happened to match would come back as 304 with an empty body,
        // silently dropping that page's data.
        return HttpRequest.get(url)
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                        + "(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36")
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,"
                        + "image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
                // A fresh random cookie pair is generated for every request.
                .header("Cookie", "_trs_uv=" + IdUtil.simpleUUID() + "; SF_cookie_1=" + IdUtil.fastUUID())
                .timeout(15000) // milliseconds
                .execute()
                .body();
    }

    /**
     * Crawls the index page and then every matching province row, printing timing at the end.
     *
     * <p>A failure inside one row is logged and does not stop the remaining rows.
     *
     * @param appendProvide province name to crawl; blank or {@code null} crawls all provinces
     * @param appendCity    city name to crawl inside {@code appendProvide}; ignored when
     *                      {@code appendProvide} is blank, blank or {@code null} crawls all cities
     */
    public static void test(String appendProvide, String appendCity) {
        System.out.println("===============开始抓取数据==================");
        // Derive the index URL from baseUrl instead of repeating the literal.
        String html = HttpUtil.get(baseUrl + "index.html");
        Elements provinceRows = Jsoup.parse(html).getElementsByClass("provincetr");

        String startDateStr = DateUtil.formatDateTime(new Date());
        long startDate = System.currentTimeMillis();

        // Deliberately single-threaded: rows are processed one after another.
        for (int i = 0; i < provinceRows.size(); i++) {
            try {
                startProvide(provinceRows, i, baseUrl, appendProvide, appendCity);
            } catch (Exception e) {
                log.error("抓取第 " + i + " 行省份数据失败", e);
            }
        }

        System.out.println("耗时 =》");
        System.out.println((System.currentTimeMillis() - startDate) / 1000);
        System.out.println("开始时间 ==> " + startDateStr);
        System.out.println("结束时间 ==>" + DateUtil.formatDateTime(new Date()));
    }

    /**
     * Processes one {@code provincetr} row, which holds several province cells.
     *
     * @param selectClasses all {@code provincetr} rows of the index page
     * @param i             index of the row to process
     * @param baseUrl       unused; kept so the signature stays unchanged
     * @param appendProvide province name to crawl; blank or {@code null} crawls every province
     * @param appendCity    city filter forwarded only when {@code appendProvide} matches
     * @throws Exception if a page below the province cannot be fetched
     */
    private static void startProvide(Elements selectClasses, Integer i, String baseUrl,
                                     String appendProvide, String appendCity) throws Exception {
        boolean allProvinces = StringUtils.isBlank(appendProvide);
        Elements cells = selectClasses.get(i).children();

        // Level 1: provinces (provincetr)
        for (int cellIndex = 0; cellIndex < cells.size(); cellIndex++) {
            Elements link = cells.get(cellIndex).select("a");
            String provideCodeUrl = link.attr("href");
            String provideName = link.text();
            if (StringUtils.isBlank(provideCodeUrl)) {
                continue; // cell without a link
            }
            if (allProvinces) {
                // Without a province filter the city filter is not applied either.
                provide(provideCodeUrl, provideName, null);
            } else if (appendProvide.equals(provideName)) {
                provide(provideCodeUrl, provideName, appendCity);
            }
        }
    }

    /**
     * Fetches a province page and crawls the cities listed on it.
     *
     * @param provideCodeUrl link of the province page relative to {@link #baseUrl};
     *                       the part before the first dot is used as the province code
     * @param provideName    province name
     * @param appendCity     city name to crawl; blank or {@code null} crawls all cities
     * @throws Exception if a page cannot be fetched
     */
    public static void provide(String provideCodeUrl, String provideName, String appendCity) throws Exception {
        String provideCode = provideCodeUrl.split("\\.")[0];
        String cityHtml = getHtml(baseUrl + provideCodeUrl);
        Elements cityRows = Jsoup.parse(cityHtml).select(".citytr");
        // todo: persist the province
        city(cityRows, provideName, provideCode, appendCity);
    }

    /**
     * Iterates the {@code citytr} rows of a province page.
     *
     * @param selectCityClass {@code citytr} rows
     * @param provideName     province name
     * @param provideCode     province code
     * @param appendCity      city name to crawl; blank or {@code null} crawls all cities
     * @throws Exception if a page cannot be fetched
     */
    private static void city(Elements selectCityClass, String provideName, String provideCode,
                             String appendCity) throws Exception {
        boolean allCities = StringUtils.isBlank(appendCity);

        // Level 2: cities (citytr)
        for (int cityIndex = 0; cityIndex < selectCityClass.size(); cityIndex++) {
            // The second cell carries the name link; the first carries the code.
            Elements link = selectCityClass.get(cityIndex).select("td").get(1).select("a");
            String gotoCountyUrl = link.attr("href");
            String cityName = link.text();
            System.out.println("城市 = " + cityName);
            if (StringUtils.isBlank(gotoCountyUrl)) {
                continue;
            }
            if (allCities || appendCity.equals(cityName)) {
                appendCity(gotoCountyUrl, provideName, provideCode, cityName);
            }
        }
    }

    /**
     * Fetches one city page and crawls the level below it.
     *
     * <p>If the page has {@code countytr} rows they are crawled as counties. Otherwise its
     * {@code towntr} rows are crawled directly as towns.
     *
     * @param gotoCountyUrl link of the city page relative to {@link #baseUrl}
     * @param provideName   province name
     * @param provideCode   province code
     * @param cityName      city name
     * @throws Exception if a page cannot be fetched
     */
    private static void appendCity(String gotoCountyUrl, String provideName, String provideCode,
                                   String cityName) throws Exception {
        var cityDoc = Jsoup.parse(getHtml(baseUrl + gotoCountyUrl));
        Elements rows = cityDoc.select(".countytr");
        boolean hasCountyLevel = !rows.isEmpty();
        if (!hasCountyLevel) {
            rows = cityDoc.select(".towntr");
        }

        // City code = file name of the link without extension, right-padded with zeros to 6 digits.
        String[] urlParts = gotoCountyUrl.split("/");
        String cityCode = StringUtils.rightPad(urlParts[urlParts.length - 1].split("\\.")[0], 6, '0');
        if (cityName.equals("市辖区")) {
            // Rows named "市辖区" get the province name instead (value prepared for the todo below).
            cityName = provideName;
        }

        // todo: persist the city (cityCode, cityName)
        if (hasCountyLevel) {
            county(rows, provideName, provideCode);
        } else {
            town(rows, provideName, provideCode, cityCode, false);
        }
    }

    /**
     * Iterates the {@code countytr} rows of a city page and crawls each county's towns.
     *
     * @param countyClass {@code countytr} rows
     * @param provideName province name
     * @param provideCode province code, used as the first path segment of the town page URL
     * @throws Exception if a page cannot be fetched
     */
    private static void county(Elements countyClass, String provideName, String provideCode) throws Exception {
        // Level 3: counties / districts (countytr)
        for (int countyIndex = 0; countyIndex < countyClass.size(); countyIndex++) {
            Elements link = countyClass.get(countyIndex).select("td").get(1).select("a");
            String gotoTownUrl = link.attr("href");
            String countyName = link.text();
            System.out.println("县区 = " + countyName);
            if (StringUtils.isBlank(gotoTownUrl)) {
                continue;
            }
            // todo: persist the county

            String townHtml = getHtml(baseUrl + provideCode + "/" + gotoTownUrl);
            Elements townRows = Jsoup.parse(townHtml).select(".towntr");
            // First path segment of the county link; village links are resolved below it.
            String gotoTownCode = gotoTownUrl.split("/")[0];
            town(townRows, provideName, provideCode, gotoTownCode, true);
        }
    }

    /**
     * Iterates {@code towntr} rows and prints every village found below each town.
     *
     * @param townClass    {@code towntr} rows
     * @param provideName  province name
     * @param provideCode  province code, first path segment of the village page URL
     * @param gotoTownCode path segment inserted between the province code and the village
     *                     link; only used when {@code flag} is {@code true}
     * @param flag         {@code true} when the rows come from a county page, {@code false}
     *                     when they come directly from a city page without a county level
     * @throws Exception if a page cannot be fetched
     */
    private static void town(Elements townClass, String provideName, String provideCode,
                             String gotoTownCode, boolean flag) throws Exception {
        // Level 4: towns / sub-districts (towntr)
        for (int townIndex = 0; townIndex < townClass.size(); townIndex++) {
            Elements link = townClass.get(townIndex).select("td").get(1).select("a");
            String gotoVillageHref = link.attr("href");
            String townName = link.text();
            System.out.println("街道。镇 = " + townName);
            if (StringUtils.isBlank(gotoVillageHref)) {
                continue;
            }
            // todo: persist the town

            String villageUrl = flag
                    ? baseUrl + provideCode + "/" + gotoTownCode + "/" + gotoVillageHref
                    : baseUrl + provideCode + "/" + gotoVillageHref;
            Elements villageRows = Jsoup.parse(getHtml(villageUrl)).select(".villagetr");

            // Level 5: villages / committees (villagetr): code, type, name
            for (int villageIndex = 0; villageIndex < villageRows.size(); villageIndex++) {
                Elements tds = villageRows.get(villageIndex).select("td");
                String villageCode = tds.get(0).text();
                String villageType = tds.get(1).text();
                String villageName = tds.get(2).text();
                System.out.println(villageCode + " " + villageType + " " + villageName);
                // todo: persist the village
            }
        }
    }
}

附录

2021全国行政区域4级,5级(不含港澳台)-MySQL文档类资源-CSDN下载

免责声明:本文章仅用于学习参考



【本文地址】


今日新闻


推荐新闻


CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3